Outline

Load Packages

Read in the data

hzip15 <- read_csv('./dash-austin-crime-report-2015/2014_Housing_Market_Analysis_Data_by_Zip_Code.csv',
            col_types = cols(`Zip Code` = col_character()))

dat <- read_csv('./dash-austin-crime-report-2015/Crime_Housing_Joined.csv',
                col_types = cols(Zip_Code_Crime = col_character(),
                                 Zip_Code_Housing = col_character(),
                                 Clearance_Date = col_date(format = '%d-%b-%y')))

Data Description

We shall refer to Crime_Housing_Joined.csv as the crime data set, and the 2014_Housing_Market_Analysis_Data_by_Zip_Code.csv as the housing data set. The crime data set contains data regarding specfic crimes that occured. For example, if the the type of crime, the zip code where the crime occurred and the lattitude and longitude of where the crime occurred. The variables generally have long names that roughly describe the variable. Details around some crimes are ommitted for some crimes due to the nature of those crime. For example, the coordinates are not provided for rapes. The housing data set provides housing and demographic information about thirty six zip codes within Austin, Texas. The names of the variables in both data sets are given with the code below. Using functions such as or created too much output for the report.

cat("Dimension of crime data set is", dim(dat), '\n')
## Dimension of crime data set is 38573 43
print("Crime Data set Variables")
## [1] "Crime Data set Variables"
print(names(dat))
##  [1] "Key"                                                                  
##  [2] "Council_District"                                                     
##  [3] "Highest_Offense_Desc"                                                 
##  [4] "Highest_NIBRS_UCR_Offense_Description"                                
##  [5] "Report_Date"                                                          
##  [6] "Location"                                                             
##  [7] "Clearance_Status"                                                     
##  [8] "Clearance_Date"                                                       
##  [9] "District"                                                             
## [10] "Zip_Code_Crime"                                                       
## [11] "Census_Tract"                                                         
## [12] "X_Coordinate"                                                         
## [13] "Y_Coordinate"                                                         
## [14] "Zip_Code_Housing"                                                     
## [15] "Populationbelowpovertylevel"                                          
## [16] "Medianhouseholdincome"                                                
## [17] "Non-WhiteNon-HispanicorLatino"                                        
## [18] "HispanicorLatinoofanyrace"                                            
## [19] "Populationwithdisability"                                             
## [20] "Unemployment"                                                         
## [21] "Largehouseholds(5+members)"                                           
## [22] "Homesaffordabletopeopleearninglessthan$50000"                         
## [23] "Rentalsaffordabletopeopleearninglessthan$25000"                       
## [24] "Rent-restrictedunits"                                                 
## [25] "HousingChoiceVoucherholders"                                          
## [26] "Medianrent"                                                           
## [27] "Medianhomevalue"                                                      
## [28] "Percentageofrentalunitsinpoorcondition"                               
## [29] "Percentchangeinnumberofhousingunits2000-2012"                         
## [30] "Ownerunitsaffordabletoaverageretail/serviceworker"                    
## [31] "Rentalunitsaffordabletoaverageretail/serviceworker"                   
## [32] "Rentalunitsaffordabletoaverageartist"                                 
## [33] "Ownerunitsaffordabletoaverageartist"                                  
## [34] "Rentalunitsaffordabletoaverageteacher"                                
## [35] "Ownerunitsaffordabletoaverageteacher"                                 
## [36] "Rentalunitsaffordabletoaveragetechworker"                             
## [37] "Ownerunitsaffordabletoaveragetechworker"                              
## [38] "Changeinpercentageofpopulationbelowpoverty2000-2012"                  
## [39] "Changeinmedianrent2000-2012"                                          
## [40] "Changeinmedianhomevalue2000-2012"                                     
## [41] "Percentageofhomeswithin1/4-mioftransitstop"                           
## [42] "Averagemonthlytransportationcost"                                     
## [43] "Percentageofhousingandtransportationcoststhatistransportation-related"
cat("Dimension of housing data set is", dim(hzip15), '\n')
## Dimension of housing data set is 37 30
print("Housing Data set Variables")
## [1] "Housing Data set Variables"
print(names(hzip15))
##  [1] "Zip Code"                                                                     
##  [2] "Population below poverty level"                                               
##  [3] "Median household income"                                                      
##  [4] "Non-White, Non-Hispanic or Latino"                                            
##  [5] "Hispanic or Latino, of any race"                                              
##  [6] "Population with disability"                                                   
##  [7] "Unemployment"                                                                 
##  [8] "Large households (5+ members)"                                                
##  [9] "Homes affordable to people earning less than $50,000"                         
## [10] "Rentals affordable to people earning less than $25,000"                       
## [11] "Rent-restricted units"                                                        
## [12] "Housing Choice Voucher holders"                                               
## [13] "Median rent"                                                                  
## [14] "Median home value"                                                            
## [15] "Percentage of rental units in poor condition"                                 
## [16] "Percent change in number of housing units, 2000-2012"                         
## [17] "Owner units affordable to average retail/service worker"                      
## [18] "Rental units affordable to average retail/service worker"                     
## [19] "Rental units affordable to average artist"                                    
## [20] "Owner units affordable to average artist"                                     
## [21] "Rental units affordable to average teacher"                                   
## [22] "Owner units affordable to average teacher"                                    
## [23] "Rental units affordable to average tech worker"                               
## [24] "Owner units affordable to average tech worker"                                
## [25] "Change in percentage of population below poverty, 2000-2012"                  
## [26] "Change in median rent, 2000-2012"                                             
## [27] "Change in median home value, 2000-2012"                                       
## [28] "Percentage of homes within 1/4-mi of transit stop"                            
## [29] "Average monthly transportation cost"                                          
## [30] "Percentage of housing and transportation costs that is transportation-related"

Data Cleaning

There are are two problems with the data sets. First, we have to map the coordinates provided in the crime data set to the coordinate reference system provided by the ggmap package. We’ll do that in the following code. Secondly, much of the data is numeric but in formats that caused the data to read in as character. This could have been avoided by reading the in the data with parser in the readr package. However, since since there dozens of such variables in both data sets, I chose to write my own function to clean the data, then apply the function columnwise using the function.

Transforming the Coordinate System

The following code uses the sp and rgdal packages to transform the coordinate reference system.

data.frame(lon = dat$X_Coordinate, lat = dat$Y_Coordinate) %>%
  filter(complete.cases(.)) -> df_coords

coordinates(df_coords) <- c("lon", "lat")
proj4string(df_coords) <- CRS("+init=esri:102739")
CRS.new <- CRS("+init=epsg:4326")

df_coords_trans <- spTransform(df_coords, CRS.new)

dat %>%
  filter(!(is.na(X_Coordinate) | is.na(Y_Coordinate))) %>%
  mutate(X_Coordinate = df_coords_trans@coords[ ,1],
         Y_Coordinate = df_coords_trans@coords[ ,2]) %>%
  select(Key, X_Coordinate, Y_Coordinate) -> dat_coords

dat %>% 
  select(-X_Coordinate, -Y_Coordinate) %>%
  full_join(dat_coords, by = "Key") -> dat

Convert data that is supposed to be numeric

The following code converts data to numeric by removing the symbols “%”“,”$“”, and “,”.

dat %>%
  select(-Key, -Council_District, -Highest_Offense_Desc, 
         -Highest_NIBRS_UCR_Offense_Description, -Report_Date, -Location,
         -Clearance_Status, -Clearance_Date, -District, -Zip_Code_Crime,
         -X_Coordinate, -Y_Coordinate, -Zip_Code_Housing) -> dat_num

my_sub <- function(x){
  x <- gsub("%", "", x)
  x <- gsub("\\$" ,"", x)
  x <- gsub(",", "", x)
  x <- as.numeric(x)
}

dat_mat <- apply(dat_num, 2, my_sub)
df_num <- tbl_df(dat_mat)

var_ix_2drop <- which(names(dat) %in% colnames(dat_mat))

dat %>%
  select(-var_ix_2drop) %>%
  bind_cols(df_num) -> dat

Here, we apply the same method to the housing data set.

hzip_mat <- apply(hzip15, 2, my_sub) 
hzip15 <- tbl_df(hzip_mat)
hzip15 %>%
  mutate(`Zip Code` = as.character(`Zip Code`))
## # A tibble: 37 x 30
##    `Zip Code` `Population below poverty level` `Median household income`
##         <chr>                            <dbl>                     <dbl>
##  1      78726                                9                     66096
##  2       <NA>                               19                     52431
##  3      78724                               38                     35711
##  4      78617                               18                     43957
##  5      78701                               20                     68152
##  6      78702                               33                     34734
##  7      78703                               10                     92606
##  8      78704                               21                     50248
##  9      78705                               66                     11917
## 10      78717                                3                     93305
## # ... with 27 more rows, and 27 more variables: `Non-White, Non-Hispanic
## #   or Latino` <dbl>, `Hispanic or Latino, of any race` <dbl>, `Population
## #   with disability` <dbl>, Unemployment <dbl>, `Large households (5+
## #   members)` <dbl>, `Homes affordable to people earning less than
## #   $50,000` <dbl>, `Rentals affordable to people earning less than
## #   $25,000` <dbl>, `Rent-restricted units` <dbl>, `Housing Choice Voucher
## #   holders` <dbl>, `Median rent` <dbl>, `Median home value` <dbl>,
## #   `Percentage of rental units in poor condition` <dbl>, `Percent change
## #   in number of housing units, 2000-2012` <dbl>, `Owner units affordable
## #   to average retail/service worker` <dbl>, `Rental units affordable to
## #   average retail/service worker` <dbl>, `Rental units affordable to
## #   average artist` <dbl>, `Owner units affordable to average
## #   artist` <dbl>, `Rental units affordable to average teacher` <dbl>,
## #   `Owner units affordable to average teacher` <dbl>, `Rental units
## #   affordable to average tech worker` <dbl>, `Owner units affordable to
## #   average tech worker` <dbl>, `Change in percentage of population below
## #   poverty, 2000-2012` <dbl>, `Change in median rent, 2000-2012` <dbl>,
## #   `Change in median home value, 2000-2012` <dbl>, `Percentage of homes
## #   within 1/4-mi of transit stop` <dbl>, `Average monthly transportation
## #   cost` <dbl>, `Percentage of housing and transportation costs that is
## #   transportation-related` <dbl>

Data Visualization

Visualize Crime in Austin, Tx

Now, let’s visualize the coordinates where each crime was commited for the crimes where the coordinates are provided.

dat %>%
  filter(!(is.na(X_Coordinate) | is.na(Y_Coordinate))) -> df_no_missing


df_no_missing %>%
  ggplot(aes(X_Coordinate, Y_Coordinate))+
    geom_point(aes(X_Coordinate, Y_Coordinate), alpha = 0.2, shape = 1)+
    geom_density2d()+
    labs(x = "Longitude", y = "Lattitude", title = 'All Crime in Austin, TX')+
    theme_grey() -> plt1

plt1

We can use the ggmap package to pull a map of Austin, Texas and overlay these points.

amap11 <- get_map(location = 'Austin, Texas', zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Austin,+Texas&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Austin,%20Texas&sensor=false

Now, we can overlay the points in the previous plot with an actual map of Austin, Texas.

ggmap(amap11)+
  geom_point(aes(X_Coordinate, Y_Coordinate), data = df_no_missing,
             alpha = 0.2, shape = 1)+
  geom_density2d(aes(X_Coordinate, Y_Coordinate), data = df_no_missing)+
  labs(x = "Longitude", y = "Lattitude", title = 'All Crime in Austin, TX')
## Warning: Removed 896 rows containing non-finite values (stat_density2d).
## Warning: Removed 896 rows containing missing values (geom_point).

As shown in the warning message about 891 observations were ommited because of the zoom. I did try to overlay this with a zoom that would include all the observations, but this was not very insightful.

We can visualize the different types of crimes using the function with the variable Highest_NIBRS_UCR_Offense_Description.

ggmap(amap11)+
  geom_point(aes(X_Coordinate, Y_Coordinate), data = df_no_missing,
             alpha = 0.2, shape = 1)+
  geom_density2d(aes(X_Coordinate, Y_Coordinate), data = df_no_missing)+
  facet_wrap(~Highest_NIBRS_UCR_Offense_Description)+
  labs(x = "Longitude", y = "Lattitude", title = 'Crime in Austin, TX')
## Warning: Removed 896 rows containing non-finite values (stat_density2d).
## Warning: Removed 896 rows containing missing values (geom_point).

Visualize Crime Over Time

dat %>%
  select(Clearance_Date) %>%
  filter(complete.cases(.))%>%
  group_by(Clearance_Date) %>%
  summarise(Count = n()) %>%
  arrange(Clearance_Date) %>%
  ggplot(aes(Clearance_Date, Count))+
    geom_line()+
    labs(x='Clearance Date', y='Number of Crimes', title='Number of Crimes over Time')+
    theme_grey()

There seems to be a large downward trend after January 2016. Also, there seems be be very a cyclical pattern, perhaps this is due to the weekday?

dat %>%
  select(Clearance_Date) %>%
  filter(complete.cases(.))%>%
  mutate(Weekday = lubridate::wday(Clearance_Date, label = TRUE)) %>%
  group_by(Weekday) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Weekday, Count))+
    geom_bar(stat = 'identity', width = 0.5)+
    labs(y='Number of Crimes', title = 'Number of Crimes by Weekday')+
    theme_grey()

There is considerably less crime on the weekend. Perhaps this may be due to data collection or perhaps crime occurrs during the weekdays for strategic purposes.

dat %>%
  select(Clearance_Date) %>%
  filter(complete.cases(.) & Clearance_Date < as.Date('010116', format='%m%d%y'))%>%
  mutate(Month = lubridate::month(Clearance_Date, label = TRUE)) %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Month, Count))+
    labs(y='Number of Crime by Month', title = 'Number of Crimes by Month')+
    geom_bar(stat = 'identity', width = 0.5)+
    theme_grey()

Analyzing Crime by Zip Code

First, there is a diparity between the zip codes in the data sets. All the zip codes in the crime data set (Crime_Housing_Joined.csv) are found in the zip code data set (2014_Housing_Market_Analysis_Data_by_Zip_Code.csv). However, not all the the zip codes found in the zip code data set are found the crime data set. This is shown in the following code.

d_zips <- unique(dat$Zip_Code_Crime)
h_zips <- unique(hzip15$`Zip Code`)
all(h_zips %in% d_zips) #All match here but
## [1] TRUE
all(d_zips %in% h_zips) #but not here
## [1] FALSE
not_fnd <- d_zips[which(!d_zips %in% h_zips)]
cat('The Zip codes', not_fnd, 'are not found in the housing data set')
## The Zip codes 78719 78653 78747 78613 78660 78736 78725 78652 78733 78737 78712 are not found in the housing data set
dat %>%
  rename(`Zip Code` = Zip_Code_Crime) %>%
  filter(`Zip Code` %in% hzip15$`Zip Code`) %>%
  group_by(`Zip Code`) %>%
  summarise(num_crimes = n()) %>%
  ungroup() %>%
  full_join(hzip15 %>%
              mutate(`Zip Code` = as.character(`Zip Code`)), by = "Zip Code") %>% 
  filter(!is.na(`Zip Code`)) -> hdat
hdat %>%
  ggplot(aes(reorder(`Zip Code`, num_crimes), num_crimes))+
    geom_bar(stat = 'identity', width = 0.5)+
    theme_grey()+
    coord_flip()+
    labs(y='Zip Code', x='Number of Crimes', title = 'Number of Crimes by Zip Code')

hdat %>%
  select(-`Zip Code`) %>%
  as.matrix() -> hmat

hmat_cor <- cor(hmat, use = 'pairwise.complete')
sort(hmat_cor[,'num_crimes'], decreasing = TRUE)
##                                                                    num_crimes 
##                                                                   1.000000000 
##                             Percentage of homes within 1/4-mi of transit stop 
##                                                                   0.580001979 
##                                     Owner units affordable to average teacher 
##                                                                   0.423820471 
##                                                Population below poverty level 
##                                                                   0.411416265 
##                                               Hispanic or Latino, of any race 
##                                                                   0.407429252 
##                          Homes affordable to people earning less than $50,000 
##                                                                   0.405675044 
##                                                    Population with disability 
##                                                                   0.335856977 
##                                 Owner units affordable to average tech worker 
##                                                                   0.335722494 
##                                                         Rent-restricted units 
##                                                                   0.335522168 
##                                     Rental units affordable to average artist 
##                                                                   0.296059290 
##                                                                  Unemployment 
##                                                                   0.279174734 
##                                  Percentage of rental units in poor condition 
##                                                                   0.273446534 
##                                    Rental units affordable to average teacher 
##                                                                   0.271934904 
##                                      Owner units affordable to average artist 
##                                                                   0.266701528 
##                      Rental units affordable to average retail/service worker 
##                                                                   0.264518528 
##                                Rental units affordable to average tech worker 
##                                                                   0.226189316 
##                        Rentals affordable to people earning less than $25,000 
##                                                                   0.213281649 
##                                        Change in median home value, 2000-2012 
##                                                                   0.188566845 
##                       Owner units affordable to average retail/service worker 
##                                                                   0.087700618 
##                                              Change in median rent, 2000-2012 
##                                                                   0.080470912 
##                                                Housing Choice Voucher holders 
##                                                                   0.077561134 
## Percentage of housing and transportation costs that is transportation-related 
##                                                                   0.058382881 
##                                                 Large households (5+ members) 
##                                                                  -0.003000815 
##                                             Non-White, Non-Hispanic or Latino 
##                                                                  -0.069126800 
##                                                                   Median rent 
##                                                                  -0.241793565 
##                                                             Median home value 
##                                                                  -0.280257309 
##                   Change in percentage of population below poverty, 2000-2012 
##                                                                  -0.335252227 
##                          Percent change in number of housing units, 2000-2012 
##                                                                  -0.351134331 
##                                           Average monthly transportation cost 
##                                                                  -0.375293889 
##                                                       Median household income 
##                                                                  -0.421947054

Let’s draw a correlation plot of the data to get an idea about some of the interactions in the data.

#Corplot
rownames(hmat_cor) <- NULL
colnames(hmat_cor) <- NULL
corrplot(hmat_cor)

The correlation plots shows many possible interactions between the data. This adds further caution against interpreting the raw correlations directly.

Clusters of Crime

I’ld like to know where most crimes occur in Austin, Texas. To do this, I will try to identify clusters of crime by using the kmeans algorithim on the coordinates. First, we have to create a data frame that only contains the coordinates.

df_no_missing %>%
  select(X_Coordinate, Y_Coordinate) -> coord_df

Next, we’ll try to identify the appropriate number of clusters by plotting the within sum of squares versus the number of clusters.

Nk <- 10
w <- numeric(Nk)
b <- numeric(Nk)
for (num_cent in 2:Nk){
  fit <- kmeans(coord_df, num_cent)
  w[num_cent] <- fit$tot.withinss
}

w <- w[-1]
qplot(2:Nk,w, xlab = 'Number of Clusters', ylab = "Within SS")

From this plot it appears 4 clusters will suffice.

kmeans_fit <- kmeans(coord_df, centers = 4)
cntrs <- kmeans_fit$centers

plt1 +
  geom_point(aes(cntrs[1,1], cntrs[1,2], col = 'red'))+
  geom_point(aes(cntrs[2,1], cntrs[2,2], col = 'red'))+
  geom_point(aes(cntrs[3,1], cntrs[3,2], col = 'red'))+
  geom_point(aes(cntrs[4,1], cntrs[4,2], col = 'red'))+
  scale_colour_discrete(name  ="Cluster", labels = NULL)

As the following code shows about 81% of crime is commited within about a 3.5 mile radius of one of these centers.

clust1 <- c(cntrs[1,1], cntrs[1,2])
clust2 <- c(cntrs[2,1], cntrs[2,2])
clust3 <- c(cntrs[3,1], cntrs[3,2])
clust4 <- c(cntrs[4,1], cntrs[4,2])


dist_clust1_miles <- numeric(nrow(coord_df))
dist_clust2_miles <- numeric(nrow(coord_df))
dist_clust3_miles <- numeric(nrow(coord_df))
dist_clust4_miles <- numeric(nrow(coord_df))

meters2miles <- 1/(1000*1.60934)

for(i in 1:nrow(coord_df)){
  x <- coord_df$X_Coordinate[i]
  y <- coord_df$Y_Coordinate[i]
  
  dist_clust1_miles[i] <- distHaversine(c(x,y), clust1) * meters2miles
  dist_clust2_miles[i] <- distHaversine(c(x,y), clust2) * meters2miles
  dist_clust3_miles[i] <- distHaversine(c(x,y), clust3) * meters2miles
  dist_clust4_miles[i] <- distHaversine(c(x,y), clust4) * meters2miles
  
}


dist_df <- tbl_df(data.frame(dist_clust1_miles, dist_clust2_miles,
                      dist_clust3_miles, dist_clust4_miles))


num_miles <- 3.5

dist_df %>%
  filter(dist_clust1_miles < num_miles 
         | dist_clust2_miles < num_miles
         | dist_clust3_miles < num_miles
         | dist_clust4_miles < num_miles) %>%
  nrow()/nrow(dist_df)
## [1] 0.8144335